/*
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 2.0
//              Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP
#define KOKKOS_EXAMPLE_FENLFUNCTORS_HPP

#include <cstdio>

#include <iostream>
#include <fstream>
#include <iomanip>
#include <cstdlib>
#include <cmath>
#include <limits>

#include <Kokkos_Pair.hpp>
#include <Kokkos_UnorderedMap.hpp>

#include <impl/Kokkos_Timer.hpp>

#include <BoxElemFixture.hpp>
#include <HexElement.hpp>
#include <CGSolve.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Example {
namespace FENL {

template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode >
class NodeNodeGraph {
public:

  typedef typename ElemNodeIdView::execution_space  execution_space ;
  typedef pair<unsigned,unsigned> key_type ;

  typedef Kokkos::UnorderedMap< key_type, void , execution_space >  SetType ;
  typedef typename CrsGraphType::row_map_type::non_const_type       RowMapType ;
  typedef Kokkos::View< unsigned ,  execution_space >               UnsignedValue ;

  // Static dimensions of 0 generate compiler warnings or errors.
  typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space >
    ElemGraphType ;

  struct TagFillNodeSet {};
  struct TagScanNodeCount {};
  struct TagFillGraphEntries {};
  struct TagSortGraphEntries {};
  struct TagFillElementGraph {};

private:

  enum PhaseType { FILL_NODE_SET ,
                   SCAN_NODE_COUNT ,
                   FILL_GRAPH_ENTRIES ,
                   SORT_GRAPH_ENTRIES ,
                   FILL_ELEMENT_GRAPH };

  const unsigned        node_count ;
  const ElemNodeIdView  elem_node_id ;
  UnsignedValue         row_total ;
  RowMapType            row_count ;
  RowMapType            row_map ;
  SetType               node_node_set ;
  PhaseType             phase ;

public:

  CrsGraphType          graph ;
  ElemGraphType         elem_graph ;

  struct Times
  {
    double ratio;
    double fill_node_set;
    double scan_node_count;
    double fill_graph_entries;
    double sort_graph_entries;
    double fill_element_graph;
  };

  NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id ,
                 const unsigned         arg_node_count,
                 Times & results
               )
    : node_count(arg_node_count)
    , elem_node_id( arg_elem_node_id )
    , row_total( "row_total" )
    , row_count(Kokkos::ViewAllocateWithoutInitializing("row_count") , node_count ) // will deep_copy to 0 inside loop
    , row_map( "graph_row_map" , node_count + 1 )
    , node_node_set()
    , phase( FILL_NODE_SET )
    , graph()
    , elem_graph()
   {
      //--------------------------------
      // Guess at capacity required for the map:

      Kokkos::Timer wall_clock ;

      wall_clock.reset();
      phase = FILL_NODE_SET ;

      // upper bound on the capacity
      size_t set_capacity = (28ull * node_count) / 2;
      unsigned failed_insert_count = 0 ;

      do {
        // Zero the row count to restart the fill
        Kokkos::deep_copy( row_count , 0u );

        node_node_set = SetType( ( set_capacity += failed_insert_count ) );

        // May be larger that requested:
        set_capacity = node_node_set.capacity();

        Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space,TagFillNodeSet>(0,elem_node_id.extent(0))
                               , *this
                               , failed_insert_count );

      } while ( failed_insert_count );

      execution_space().fence();
      results.ratio = (double)node_node_set.size() / (double)node_node_set.capacity();
      results.fill_node_set = wall_clock.seconds();
      //--------------------------------

      wall_clock.reset();
      phase = SCAN_NODE_COUNT ;

      // Exclusive scan of row_count into row_map
      // including the final total in the 'node_count + 1' position.
      // Zero the 'row_count' values.
      Kokkos::parallel_scan( node_count , *this );

      // Zero the row count for the fill:
      Kokkos::deep_copy( row_count , 0u );

      unsigned graph_entry_count = 0 ;

      Kokkos::deep_copy( graph_entry_count , row_total );

      // Assign graph's row_map and allocate graph's entries
      graph.row_map = row_map ;
      graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count );

      //--------------------------------
      // Fill graph's entries from the (node,node) set.

      execution_space().fence();
      results.scan_node_count = wall_clock.seconds();

      wall_clock.reset();
      phase = FILL_GRAPH_ENTRIES ;
      Kokkos::parallel_for( node_node_set.capacity() , *this );

      execution_space().fence();
      results.fill_graph_entries = wall_clock.seconds();

      //--------------------------------
      // Done with the temporary sets and arrays
      wall_clock.reset();
      phase = SORT_GRAPH_ENTRIES ;

      row_total = UnsignedValue();
      row_count = RowMapType();
      row_map   = RowMapType();
      node_node_set.clear();

      //--------------------------------

      Kokkos::parallel_for( node_count , *this );

      execution_space().fence();
      results.sort_graph_entries = wall_clock.seconds();

      //--------------------------------
      // Element-to-graph mapping:
      wall_clock.reset();
      phase = FILL_ELEMENT_GRAPH ;
      elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) );
      Kokkos::parallel_for( elem_node_id.extent(0) , *this );

      execution_space().fence();
      results.fill_element_graph = wall_clock.seconds();
    }

  //------------------------------------
  // parallel_for: create map and count row length

  KOKKOS_INLINE_FUNCTION
  void operator()( const TagFillNodeSet & , unsigned ielem , unsigned & count ) const
  {
    // Loop over element's (row_local_node,col_local_node) pairs:
    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {

      const unsigned row_node = elem_node_id( ielem , row_local_node );

      for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {

        const unsigned col_node = elem_node_id( ielem , col_local_node );

        // If either node is locally owned then insert the pair into the unordered map:

        if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) {

          const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ;

          const typename SetType::insert_result result = node_node_set.insert( key );

          // A successfull insert: the first time this pair was added
          if ( result.success() ) {

            // If row node is owned then increment count
            if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); }

            // If column node is owned and not equal to row node then increment count
            if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); }
          }
          else if ( result.failed() ) {
            ++count ;
          }
        }
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void fill_graph_entries( const unsigned iset ) const
  {
    if ( node_node_set.valid_at(iset) ) {
      // Add each entry to the graph entries.

      const key_type key = node_node_set.key_at(iset) ;
      const unsigned row_node = key.first ;
      const unsigned col_node = key.second ;

      if ( row_node < row_count.extent(0) ) {
        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );
        graph.entries( offset ) = col_node ;
      }

      if ( col_node < row_count.extent(0) && col_node != row_node ) {
        const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 );
        graph.entries( offset ) = row_node ;
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void sort_graph_entries( const unsigned irow ) const
  {
    const unsigned row_beg = graph.row_map( irow );
    const unsigned row_end = graph.row_map( irow + 1 );
    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
      const unsigned col = graph.entries(i);
      unsigned j = i ;
      for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) {
        graph.entries(j) = graph.entries(j-1);
      }
      graph.entries(j) = col ;
    }
  }

  KOKKOS_INLINE_FUNCTION
  void fill_elem_graph_map( const unsigned ielem ) const
  {
    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {

      const unsigned row_node = elem_node_id( ielem , row_local_node );

      for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) {

        const unsigned col_node = elem_node_id( ielem , col_local_node );

        unsigned entry = ~0u ;

        if ( row_node + 1 < graph.row_map.extent(0) ) {

          const unsigned entry_end = graph.row_map( row_node + 1 );

          entry = graph.row_map( row_node );

          for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry );

          if ( entry == entry_end ) entry = ~0u ;
        }

        elem_graph( ielem , row_local_node , col_local_node ) = entry ;
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned iwork ) const
  {
/*
    if ( phase == FILL_NODE_SET ) {
      operator()( TagFillNodeSet() , iwork );
    }
    else */
    if ( phase == FILL_GRAPH_ENTRIES ) {
      fill_graph_entries( iwork );
    }
    else if ( phase == SORT_GRAPH_ENTRIES ) {
      sort_graph_entries( iwork );
    }
    else if ( phase == FILL_ELEMENT_GRAPH ) {
      fill_elem_graph_map( iwork );
    }
  }

  //------------------------------------
  // parallel_scan: row offsets

  typedef unsigned value_type ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned irow , unsigned & update , const bool final ) const
  {
    // exclusive scan
    if ( final ) { row_map( irow ) = update ; }

    update += row_count( irow );

    if ( final ) {
      if ( irow + 1 == row_count.extent(0) ) {
        row_map( irow + 1 ) = update ;
        row_total()         = update ;
      }
    }
  }

  // For the reduce phase:
  KOKKOS_INLINE_FUNCTION
  void init( const TagFillNodeSet & , unsigned & update ) const { update = 0 ; }

  KOKKOS_INLINE_FUNCTION
  void join( const TagFillNodeSet &
           , volatile       unsigned & update
           , volatile const unsigned & input ) const { update += input ; }

  // For the scan phase::
  KOKKOS_INLINE_FUNCTION
  void init( unsigned & update ) const { update = 0 ; }

  KOKKOS_INLINE_FUNCTION
  void join( volatile       unsigned & update
           , volatile const unsigned & input ) const { update += input ; }

  //------------------------------------
};

} /* namespace FENL */
} /* namespace Example */
} /* namespace Kokkos  */

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Example {
namespace FENL {

template< class ElemCompType >
class NodeElemGatherFill {
public:

  typedef typename ElemCompType::execution_space         execution_space ;
  typedef typename ElemCompType::vector_type         vector_type ;
  typedef typename ElemCompType::sparse_matrix_type  sparse_matrix_type ;
  typedef typename ElemCompType::elem_node_type      elem_node_type ;
  typedef typename ElemCompType::elem_vectors_type   elem_vectors_type ;
  typedef typename ElemCompType::elem_matrices_type  elem_matrices_type ;
  typedef typename ElemCompType::elem_graph_type     elem_graph_type ;

  static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ;

  //------------------------------------

private:

  typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space >  CrsGraphType ;
  typedef typename CrsGraphType::row_map_type::non_const_type  RowMapType ;
  typedef Kokkos::View< unsigned ,  execution_space >              UnsignedValue ;

  enum PhaseType { FILL_NODE_COUNT ,
                   SCAN_NODE_COUNT ,
                   FILL_GRAPH_ENTRIES ,
                   SORT_GRAPH_ENTRIES ,
                   GATHER_FILL };

  const elem_node_type  elem_node_id ;
  const elem_graph_type elem_graph ;
  UnsignedValue         row_total ;
  RowMapType            row_count ;
  RowMapType            row_map ;
  CrsGraphType          graph ;
  vector_type           residual ;
  sparse_matrix_type    jacobian ;
  elem_vectors_type     elem_residual ;
  elem_matrices_type    elem_jacobian ;
  PhaseType             phase ;

public:

  NodeElemGatherFill()
    : elem_node_id()
    , elem_graph()
    , row_total()
    , row_count()
    , row_map()
    , graph()
    , residual()
    , jacobian()
    , elem_residual()
    , elem_jacobian()
    , phase( FILL_NODE_COUNT )
    {}

  NodeElemGatherFill( const NodeElemGatherFill & rhs )
    : elem_node_id(  rhs.elem_node_id )
    , elem_graph(    rhs.elem_graph )
    , row_total(     rhs.row_total )
    , row_count(     rhs.row_count )
    , row_map(       rhs.row_map )
    , graph(         rhs.graph )
    , residual(      rhs.residual )
    , jacobian(      rhs.jacobian )
    , elem_residual( rhs.elem_residual )
    , elem_jacobian( rhs.elem_jacobian )
    , phase(         rhs.phase )
    {}

  NodeElemGatherFill( const elem_node_type     & arg_elem_node_id ,
                      const elem_graph_type    & arg_elem_graph ,
                      const vector_type        & arg_residual ,
                      const sparse_matrix_type & arg_jacobian ,
                      const elem_vectors_type  & arg_elem_residual ,
                      const elem_matrices_type & arg_elem_jacobian )
    : elem_node_id( arg_elem_node_id )
    , elem_graph( arg_elem_graph )
    , row_total( "row_total" )
    , row_count( "row_count" , arg_residual.extent(0) )
    , row_map( "graph_row_map" , arg_residual.extent(0) + 1 )
    , graph()
    , residual( arg_residual )
    , jacobian( arg_jacobian )
    , elem_residual( arg_elem_residual )
    , elem_jacobian( arg_elem_jacobian )
    , phase( FILL_NODE_COUNT )
    {
      //--------------------------------
      // Count node->element relations

      phase = FILL_NODE_COUNT ;

      Kokkos::parallel_for( elem_node_id.extent(0) , *this );

      //--------------------------------

      phase = SCAN_NODE_COUNT ;

      // Exclusive scan of row_count into row_map
      // including the final total in the 'node_count + 1' position.
      // Zero the 'row_count' values.
      Kokkos::parallel_scan( residual.extent(0) , *this );

      // Zero the row count for the fill:
      Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) );

      unsigned graph_entry_count = 0 ;

      Kokkos::deep_copy( graph_entry_count , row_total );

      // Assign graph's row_map and allocate graph's entries
      graph.row_map = row_map ;

      typedef typename CrsGraphType::entries_type graph_entries_type ;

      graph.entries = graph_entries_type( "graph_entries" , graph_entry_count );

      //--------------------------------
      // Fill graph's entries from the (node,node) set.

      phase = FILL_GRAPH_ENTRIES ;

      Kokkos::deep_copy( row_count , 0u );
      Kokkos::parallel_for( elem_node_id.extent(0) , *this );

      execution_space().fence();

      //--------------------------------
      // Done with the temporary sets and arrays

      row_total = UnsignedValue();
      row_count = RowMapType();
      row_map   = RowMapType();

      //--------------------------------

      phase = SORT_GRAPH_ENTRIES ;
      Kokkos::parallel_for( residual.extent(0) , *this );

      execution_space().fence();

      phase = GATHER_FILL ;
    }

  void apply() const
  {
    Kokkos::parallel_for( residual.extent(0) , *this );
  }

  //------------------------------------
  //------------------------------------
  // parallel_for: Count node->element pairs

  KOKKOS_INLINE_FUNCTION
  void fill_node_count( const unsigned ielem ) const
  {
    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {

      const unsigned row_node = elem_node_id( ielem , row_local_node );

      if ( row_node < row_count.extent(0) ) {
        atomic_fetch_add( & row_count( row_node ) , 1 );
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void fill_graph_entries( const unsigned ielem ) const
  {
    for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) {

      const unsigned row_node = elem_node_id( ielem , row_local_node );

      if ( row_node < row_count.extent(0) ) {

        const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 );

        graph.entries( offset , 0 ) = ielem ;
        graph.entries( offset , 1 ) = row_local_node ;
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void sort_graph_entries( const unsigned irow ) const
  {
    const unsigned row_beg = graph.row_map( irow );
    const unsigned row_end = graph.row_map( irow + 1 );
    for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) {
      const unsigned elem  = graph.entries(i,0);
      const unsigned local = graph.entries(i,1);
      unsigned j = i ;
      for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) {
        graph.entries(j,0) = graph.entries(j-1,0);
        graph.entries(j,1) = graph.entries(j-1,1);
      }
      graph.entries(j,0) = elem ;
      graph.entries(j,1) = local ;
    }
  }

  //------------------------------------

  KOKKOS_INLINE_FUNCTION
  void gather_fill( const unsigned irow ) const
  {
    const unsigned node_elem_begin = graph.row_map(irow);
    const unsigned node_elem_end   = graph.row_map(irow+1);

    //  for each element that a node belongs to

    for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) {

      const unsigned elem_id   = graph.entries( i, 0);
      const unsigned row_index = graph.entries( i, 1);

      residual(irow) += elem_residual(elem_id, row_index);

      //  for each node in a particular related element
      //  gather the contents of the element stiffness
      //  matrix that belong in irow

      for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) {
        const unsigned A_index = elem_graph( elem_id , row_index , j );

        jacobian.coeff( A_index ) += elem_jacobian( elem_id, row_index, j );
      }
    }
  }

  //------------------------------------

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned iwork ) const
  {
    if ( phase == FILL_NODE_COUNT ) {
      fill_node_count( iwork );
    }
    else if ( phase == FILL_GRAPH_ENTRIES ) {
      fill_graph_entries( iwork );
    }
    else if ( phase == SORT_GRAPH_ENTRIES ) {
      sort_graph_entries( iwork );
    }
    else if ( phase == GATHER_FILL ) {
      gather_fill( iwork );
    }
  }

  //------------------------------------
  // parallel_scan: row offsets

  typedef unsigned value_type ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned irow , unsigned & update , const bool final ) const
  {
    // exclusive scan
    if ( final ) { row_map( irow ) = update ; }

    update += row_count( irow );

    if ( final ) {
      if ( irow + 1 == row_count.extent(0) ) {
        row_map( irow + 1 ) = update ;
        row_total()         = update ;
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void init( unsigned & update ) const { update = 0 ; }

  KOKKOS_INLINE_FUNCTION
  void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; }
};

} /* namespace FENL */
} /* namespace Example */
} /* namespace Kokkos  */

//----------------------------------------------------------------------------

namespace Kokkos {
namespace Example {
namespace FENL {

template< class FiniteElementMeshType , class SparseMatrixType >
class ElementComputation ;


template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
class ElementComputation<
  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
{
public:

  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
  typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode >             element_data_type ;

  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;

  typedef ExecSpace   execution_space ;
  typedef ScalarType  scalar_type ;

  static const unsigned SpatialDim       = element_data_type::spatial_dimension ;
  static const unsigned TensorDim        = SpatialDim * SpatialDim ;
  static const unsigned ElemNodeCount    = element_data_type::element_node_count ;
  static const unsigned FunctionCount    = element_data_type::function_count ;
  static const unsigned IntegrationCount = element_data_type::integration_count ;

  //------------------------------------

  typedef typename mesh_type::node_coord_type                                      node_coord_type ;
  typedef typename mesh_type::elem_node_type                                       elem_node_type ;
  typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ;
  typedef Kokkos::View< scalar_type*[FunctionCount] ,                execution_space > elem_vectors_type ;
  typedef Kokkos::View< scalar_type* ,                               execution_space > vector_type ;

  typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ;

  //------------------------------------


  //------------------------------------
  // Computational data:

  const element_data_type   elem_data ;
  const elem_node_type      elem_node_ids ;
  const node_coord_type     node_coords ;
  const elem_graph_type     elem_graph ;
  const elem_matrices_type  elem_jacobians ;
  const elem_vectors_type   elem_residuals ;
  const vector_type         solution ;
  const vector_type         residual ;
  const sparse_matrix_type  jacobian ;
  const scalar_type         coeff_K ;

  ElementComputation( const ElementComputation & rhs )
    : elem_data()
    , elem_node_ids( rhs.elem_node_ids )
    , node_coords(   rhs.node_coords )
    , elem_graph(    rhs.elem_graph )
    , elem_jacobians( rhs.elem_jacobians )
    , elem_residuals( rhs.elem_residuals )
    , solution( rhs.solution )
    , residual( rhs.residual )
    , jacobian( rhs.jacobian )
    , coeff_K( rhs.coeff_K )
    {}

  // If the element->sparse_matrix graph is provided then perform atomic updates
  // Otherwise fill per-element contributions for subequent gather-add into a residual and jacobian.
  ElementComputation( const mesh_type          & arg_mesh ,
	              const scalar_type          arg_coeff_K ,
                      const vector_type        & arg_solution ,
                      const elem_graph_type    & arg_elem_graph ,
                      const sparse_matrix_type & arg_jacobian ,
                      const vector_type        & arg_residual )
    : elem_data()
    , elem_node_ids( arg_mesh.elem_node() )
    , node_coords(   arg_mesh.node_coord() )
    , elem_graph(    arg_elem_graph )
    , elem_jacobians()
    , elem_residuals()
    , solution( arg_solution )
    , residual( arg_residual )
    , jacobian( arg_jacobian )
    , coeff_K( arg_coeff_K )
    {}

  ElementComputation( const mesh_type    & arg_mesh ,
	              const scalar_type    arg_coeff_K ,
                      const vector_type  & arg_solution )
    : elem_data()
    , elem_node_ids( arg_mesh.elem_node() )
    , node_coords(   arg_mesh.node_coord() )
    , elem_graph()
    , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() )
    , elem_residuals( "elem_residuals" , arg_mesh.elem_count() )
    , solution( arg_solution )
    , residual()
    , jacobian()
    , coeff_K( arg_coeff_K )
    {}

  //------------------------------------

  void apply() const
  {
    parallel_for( elem_node_ids.extent(0) , *this );
  }

  //------------------------------------

  static const unsigned FLOPS_transform_gradients =
     /* Jacobian */           FunctionCount * TensorDim * 2 +
     /* Inverse jacobian */   TensorDim * 6 + 6 +
     /* Gradient transform */ FunctionCount * 15 ;

  KOKKOS_INLINE_FUNCTION
  float transform_gradients(
    const float grad[][ FunctionCount ] , // Gradient of bases master element
    const double x[] ,
    const double y[] ,
    const double z[] ,
    float dpsidx[] ,
    float dpsidy[] ,
    float dpsidz[] ) const
  {
    enum { j11 = 0 , j12 = 1 , j13 = 2 ,
           j21 = 3 , j22 = 4 , j23 = 5 ,
           j31 = 6 , j32 = 7 , j33 = 8 };

    // Jacobian accumulation:

    double J[ TensorDim ] = { 0, 0, 0,  0, 0, 0,  0, 0, 0 };

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const double x1 = x[i] ;
      const double x2 = y[i] ;
      const double x3 = z[i] ;

      const float g1 = grad[0][i] ;
      const float g2 = grad[1][i] ;
      const float g3 = grad[2][i] ;

      J[j11] += g1 * x1 ;
      J[j12] += g1 * x2 ;
      J[j13] += g1 * x3 ;

      J[j21] += g2 * x1 ;
      J[j22] += g2 * x2 ;
      J[j23] += g2 * x3 ;

      J[j31] += g3 * x1 ;
      J[j32] += g3 * x2 ;
      J[j33] += g3 * x3 ;
    }

    // Inverse jacobian:

    float invJ[ TensorDim ] = {
      static_cast<float>( J[j22] * J[j33] - J[j23] * J[j32] ) ,
      static_cast<float>( J[j13] * J[j32] - J[j12] * J[j33] ) ,
      static_cast<float>( J[j12] * J[j23] - J[j13] * J[j22] ) ,

      static_cast<float>( J[j23] * J[j31] - J[j21] * J[j33] ) ,
      static_cast<float>( J[j11] * J[j33] - J[j13] * J[j31] ) ,
      static_cast<float>( J[j13] * J[j21] - J[j11] * J[j23] ) ,

      static_cast<float>( J[j21] * J[j32] - J[j22] * J[j31] ) ,
      static_cast<float>( J[j12] * J[j31] - J[j11] * J[j32] ) ,
      static_cast<float>( J[j11] * J[j22] - J[j12] * J[j21] ) };

    const float detJ = J[j11] * invJ[j11] +
                       J[j21] * invJ[j12] +
                       J[j31] * invJ[j13] ;

    const float detJinv = 1.0 / detJ ;

    for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; }

    // Transform gradients:

    for( unsigned i = 0; i < FunctionCount ; ++i ) {
      const float g0 = grad[0][i];
      const float g1 = grad[1][i];
      const float g2 = grad[2][i];

      dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13];
      dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23];
      dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33];
    }

    return detJ ;
  }

  KOKKOS_INLINE_FUNCTION
  void contributeResidualJacobian(
    const float coeff_k ,
    const double dof_values[] ,
    const float dpsidx[] ,
    const float dpsidy[] ,
    const float dpsidz[] ,
    const float detJ ,
    const float integ_weight ,
    const float bases_vals[] ,
    double elem_res[] ,
    double elem_mat[][ FunctionCount ] ) const
  {
    double value_at_pt = 0 ;
    double gradx_at_pt = 0 ;
    double grady_at_pt = 0 ;
    double gradz_at_pt = 0 ;

    for ( unsigned m = 0 ; m < FunctionCount ; m++ ) {
      value_at_pt += dof_values[m] * bases_vals[m] ;
      gradx_at_pt += dof_values[m] * dpsidx[m] ;
      grady_at_pt += dof_values[m] * dpsidy[m] ;
      gradz_at_pt += dof_values[m] * dpsidz[m] ;
    }

    const scalar_type k_detJ_weight = coeff_k        * detJ * integ_weight ;
    const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ;
    const double mat_val = 2.0 * value_at_pt         * detJ * integ_weight ;

    // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$
    // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$

    for ( unsigned m = 0; m < FunctionCount; ++m) {
      double * const mat = elem_mat[m] ;
      const float bases_val_m = bases_vals[m];
      const float dpsidx_m    = dpsidx[m] ;
      const float dpsidy_m    = dpsidy[m] ;
      const float dpsidz_m    = dpsidz[m] ;

      elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt +
                                       dpsidy_m * grady_at_pt +
                                       dpsidz_m * gradz_at_pt ) +
                     res_val * bases_val_m ;

      for( unsigned n = 0; n < FunctionCount; n++) {

        mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] +
                                    dpsidy_m * dpsidy[n] +
                                    dpsidz_m * dpsidz[n] ) +
                  mat_val * bases_val_m * bases_vals[n];
      }
    }
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned ielem ) const
  {
    // Gather nodal coordinates and solution vector:

    double x[ FunctionCount ] ;
    double y[ FunctionCount ] ;
    double z[ FunctionCount ] ;
    double val[ FunctionCount ] ;
    unsigned node_index[ ElemNodeCount ];

    for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) {
      const unsigned ni = elem_node_ids( ielem , i );

      node_index[i] = ni ;

      x[i] = node_coords( ni , 0 );
      y[i] = node_coords( ni , 1 );
      z[i] = node_coords( ni , 2 );

      val[i] = solution( ni );
    }


    double elem_vec[ FunctionCount ] ;
    double elem_mat[ FunctionCount ][ FunctionCount ] ;

    for( unsigned i = 0; i < FunctionCount ; i++ ) {
      elem_vec[i] = 0 ;
      for( unsigned j = 0; j < FunctionCount ; j++){
        elem_mat[i][j] = 0 ;
      }
    }


    for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) {
      float dpsidx[ FunctionCount ] ;
      float dpsidy[ FunctionCount ] ;
      float dpsidz[ FunctionCount ] ;

      const float detJ =
        transform_gradients( elem_data.gradients[i] , x , y , z ,
                             dpsidx , dpsidy , dpsidz );

      contributeResidualJacobian( coeff_K ,
                                  val , dpsidx , dpsidy , dpsidz ,
                                  detJ ,
                                  elem_data.weights[i] ,
                                  elem_data.values[i] ,
                                  elem_vec , elem_mat );
    }

#if 0

if ( 1 == ielem ) {
  printf("ElemResidual { %f %f %f %f %f %f %f %f }\n",
         elem_vec[0], elem_vec[1], elem_vec[2], elem_vec[3],
         elem_vec[4], elem_vec[5], elem_vec[6], elem_vec[7]);

  printf("ElemJacobian {\n");

  for ( unsigned j = 0 ; j < FunctionCount ; ++j ) {
  printf("  { %f %f %f %f %f %f %f %f }\n",
         elem_mat[j][0], elem_mat[j][1], elem_mat[j][2], elem_mat[j][3],
         elem_mat[j][4], elem_mat[j][5], elem_mat[j][6], elem_mat[j][7]);
  }
  printf("}\n");
}

#endif

    if ( ! residual.extent(0) ) {
      for( unsigned i = 0; i < FunctionCount ; i++){
        elem_residuals(ielem, i) = elem_vec[i] ;
        for( unsigned j = 0; j < FunctionCount ; j++){
          elem_jacobians(ielem, i, j) = elem_mat[i][j] ;
        }
      }
    }
    else {
      for( unsigned i = 0 ; i < FunctionCount ; i++ ) {
        const unsigned row = node_index[i] ;
        if ( row < residual.extent(0) ) {
          atomic_fetch_add( & residual( row ) , elem_vec[i] );

          for( unsigned j = 0 ; j < FunctionCount ; j++ ) {
            const unsigned entry = elem_graph( ielem , i , j );
            if ( entry != ~0u ) {
              atomic_fetch_add( & jacobian.coeff( entry ) , elem_mat[i][j] );
            }
          }
        }
      }
    }
  }
}; /* ElementComputation */

//----------------------------------------------------------------------------

template< class FixtureType , class SparseMatrixType >
class DirichletComputation ;

template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType >
class DirichletComputation<
  Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > ,
  Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > >
{
public:

  typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap >  mesh_type ;
  typedef typename mesh_type::node_coord_type                                 node_coord_type ;
  typedef typename node_coord_type::value_type                                scalar_coord_type ;

  typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace >  sparse_matrix_type ;
  typedef typename sparse_matrix_type::StaticCrsGraphType       sparse_graph_type ;

  typedef ExecSpace   execution_space ;
  typedef ScalarType  scalar_type ;

  //------------------------------------

  typedef Kokkos::View< scalar_type* , execution_space > vector_type ;

  //------------------------------------
  // Computational data:

  const node_coord_type     node_coords ;
  const vector_type         solution ;
  const sparse_matrix_type  jacobian ;
  const vector_type         residual ;
  const scalar_type         bc_lower_value ;
  const scalar_type         bc_upper_value ;
  const scalar_coord_type   bc_lower_limit ;
  const scalar_coord_type   bc_upper_limit ;
  const unsigned            bc_plane ;
  const unsigned            node_count ;
        bool                init ;


  DirichletComputation( const mesh_type          & arg_mesh ,
                        const vector_type        & arg_solution ,
                        const sparse_matrix_type & arg_jacobian ,
                        const vector_type        & arg_residual ,
                        const unsigned             arg_bc_plane ,
                        const scalar_type          arg_bc_lower_value ,
                        const scalar_type          arg_bc_upper_value )
    : node_coords( arg_mesh.node_coord() )
    , solution(    arg_solution )
    , jacobian(    arg_jacobian )
    , residual(    arg_residual )
    , bc_lower_value( arg_bc_lower_value )
    , bc_upper_value( arg_bc_upper_value )
    , bc_lower_limit( std::numeric_limits<scalar_coord_type>::epsilon() )
    , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits<scalar_coord_type>::epsilon() )
    , bc_plane(       arg_bc_plane )
    , node_count( arg_mesh.node_count_owned() )
    , init( false )
    {
      parallel_for( node_count , *this );
      init = true ;
    }

  void apply() const
  {
    parallel_for( node_count , *this );
  }

  //------------------------------------

  KOKKOS_INLINE_FUNCTION
  void operator()( const unsigned inode ) const
  {
    //  Apply dirichlet boundary condition on the Solution and Residual vectors.
    //  To maintain the symmetry of the original global stiffness matrix,
    //  zero out the columns that correspond to boundary conditions, and
    //  update the residual vector accordingly

    const unsigned iBeg = jacobian.graph.row_map[inode];
    const unsigned iEnd = jacobian.graph.row_map[inode+1];

    const scalar_coord_type c = node_coords(inode,bc_plane);
    const bool bc_lower = c <= bc_lower_limit ;
    const bool bc_upper = bc_upper_limit <= c ;

    if ( ! init ) {
      solution(inode) = bc_lower ? bc_lower_value : (
                        bc_upper ? bc_upper_value : 0 );
    }
    else {
      if ( bc_lower || bc_upper ) {

        residual(inode) = 0 ;

        //  zero each value on the row, and leave a one
        //  on the diagonal

        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
          jacobian.coeff(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ;
        }
      }
      else {

        //  Find any columns that are boundary conditions.
        //  Clear them and adjust the residual vector

        for( unsigned i = iBeg ; i < iEnd ; ++i ) {
          const unsigned       cnode = jacobian.graph.entries(i) ;
          const scalar_coord_type cc = node_coords(cnode,bc_plane);

          if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) {
            jacobian.coeff(i) = 0 ;
          }
        }
      }
    }
  }
};

} /* namespace FENL */
} /* namespace Example */
} /* namespace Kokkos  */

//----------------------------------------------------------------------------

/* A Cuda-specific specialization for the element computation functor. */
#if defined( __CUDACC__ )
// #include <NonlinearElement_Cuda.hpp>
#endif

//----------------------------------------------------------------------------

#endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */

